{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install -U pip\n",
    "\n",
    "# If you don't have ClearML installed then uncomment this line\n",
    "# ! pip install -U clearml==0.16.2rc0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install -U pandas==1.0.4\n",
    "! pip install -U scikit-learn==0.23.1\n",
    "! pip install -U pathlib2==2.3.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib2 import Path\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from clearml import Task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "task = Task.init(\n",
    "    project_name=\"Tabular Example\", task_name=\"Download and split tabular dataset\"\n",
    ")\n",
    "logger = task.get_logger()\n",
    "configuration_dict = {\"test_size\": 0.1, \"split_random_state\": 0}\n",
    "configuration_dict = task.connect(\n",
    "    configuration_dict\n",
    ")  # enabling configuration override by clearml\n",
    "print(\n",
    "    configuration_dict\n",
    ")  # printing actual configuration (after override in remote mode)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **Downloading**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)\n",
    "# and save it to your cloud storage or your mounted local storage\n",
    "# If the data is on your cloud storage, you can use clearml' storage manager to get a local copy of it:\n",
    "#    from clearml.storage import StorageManager\n",
    "#    path_to_ShelterAnimal = StorageManager.get_local_copy(\"https://allegro-datasets.s3.amazonaws.com/clearml/UrbanSound8K.zip\",\n",
    "#                                                          extract_archive=True)\n",
    "path_to_ShelterAnimal = \"/home/sam/Datasets/shelter-animal-outcomes\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_set = pd.read_csv(Path(path_to_ShelterAnimal) / \"train.csv\")\n",
    "logger.report_table(\n",
    "    title=\"Trainset - raw\",\n",
    "    series=\"pandas DataFrame\",\n",
    "    iteration=0,\n",
    "    table_plot=train_set.head(),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **Splitting to train and val**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = train_set.drop(columns=[\"OutcomeType\"])\n",
    "Y = train_set[\"OutcomeType\"]\n",
    "X_train, X_val, Y_train, Y_val = train_test_split(\n",
    "    X,\n",
    "    Y,\n",
    "    test_size=configuration_dict.get(\"test_size\", 0.1),\n",
    "    random_state=configuration_dict.get(\"split_random_state\", 0),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = X_train.join(Y_train)\n",
    "val_df = X_val.join(Y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "task.upload_artifact(\"train_data\", artifact_object=train_df)\n",
    "task.upload_artifact(\"val_data\", artifact_object=val_df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
